{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import itertools\n",
    "import pickle\n",
    "import hickle \n",
    "import gzip\n",
    "import operator\n",
    "import os\n",
    "import sys\n",
    "from time import time\n",
    "import pprint as pp\n",
    "import collections\n",
    "import ConfigParser\n",
    "from collections import Counter\n",
    "from operator import itemgetter\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import twitter\n",
    "\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import Normalizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.cluster import KMeans, MiniBatchKMeans\n",
    "from sklearn.externals import joblib\n",
    "from sklearn.feature_extraction import text \n",
    "\n",
    "# bokeh\n",
    "import bokeh.plotting as bkplt\n",
    "from bokeh.charts import *\n",
    "from bokeh.io import output_notebook\n",
    "from bokeh.charts import Histogram, show\n",
    "\n",
    "# import requirments \n",
    "from IPython.display import Image\n",
    "from IPython.display import display\n",
    "import matplotlib.pyplot as plt\n",
    "import json\n",
    "import rpy2\n",
    "%load_ext rpy2.ipython\n",
    "%R require(\"ggplot2\")\n",
    "% matplotlib inline\n",
    "from ggplot import *\n",
    "randn = np.random.randn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup & Creds\n",
    "Let's grab some json records from Twitter's public api. \n",
    "\n",
    "We'll use python-twitter. \n",
    "<pre>\n",
    "$ pip install python-twitter\n",
    "$ pydoc twitter.Api\n",
    "</pre>\n",
    "\n",
    "Build an app [https://apps.twitter.com/](https://apps.twitter.com/).  \n",
    "\n",
    "Then use the app info in the `config.cfg` file.\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# read the config file.\n",
    "config = ConfigParser.RawConfigParser()\n",
    "config.read('config.cfg')\n",
    "\n",
    "# creds found in your Twitter app. See https://apps.twitter.com/\n",
    "token = config.get('oauth','token')\n",
    "token_secret = config.get('oauth','token_secret')\n",
    "con_key = config.get('oauth','con_key')\n",
    "con_secret_key = config.get('oauth','con_secret_key')\n",
    "\n",
    "# setup \n",
    "api = twitter.Api(\n",
    "    consumer_key=con_key\n",
    "    , consumer_secret=con_secret_key\n",
    "    , access_token_key = token\n",
    "    , access_token_secret = token_secret)\n",
    "\n",
    "# test creds\n",
    "print \"@{}\".format(api.VerifyCredentials().GetScreenName())\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# get ~5000 tweets from the public API.\n",
    "results = api.GetSearch(term = 'golden retriever', count = 100, include_entities=True)                \n",
    "counter = 1\n",
    "total_tweets = 5000\n",
    "tweets = []\n",
    "while counter <= total_tweets:\n",
    "    if counter == 1:\n",
    "        new_results = api.GetSearch(term = 'golden retriever'\n",
    "                                    , count = 100\n",
    "                                    , max_id = results[-1].GetId()\n",
    "                                    , include_entities=True)\n",
    "    else:\n",
    "        new_results = api.GetSearch(term = 'golden retriever'\n",
    "                                    , count = 100\n",
    "                                    , max_id = new_results[-1].GetId()\n",
    "                                    , include_entities=True)\n",
    "    counter += len(new_results)\n",
    "    tweets.extend(new_results)\n",
    "# store tweets\n",
    "tweet_text = [tweet.GetText() for tweet in tweets]\n",
    "print len(tweet_text)\n",
    "pickle.dump(tweet_text,open('./data/tweet_text.pkl','wb'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train/Test set\n",
    "Split the training and test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Set up a training and test set.\n",
    "\n",
    "def create_index(total_tweets):\n",
    "        \"\"\"\n",
    "        Builds an index for the training and test set.\n",
    "        The sets serve as a list of row numbers to extract from the dataset. \n",
    "        \"\"\"\n",
    "        # based on the total tweet count, create an array of all line numbers \n",
    "        line_index = np.array(range(0,total_tweets))\n",
    "        # split the array into training and test sets of index values\n",
    "        trainIndex,testIndex = train_test_split(line_index,train_size=0.70, random_state=42)\n",
    "        # save test & traning index values\n",
    "        #np.save(\"training_index\",trainIndex)\n",
    "        #np.save(\"testing_index\",testIndex)\n",
    "        return trainIndex,testIndex\n",
    "\n",
    "# build indicies \n",
    "trainIndex,testIndex = create_index(len(tweet_text))\n",
    "pickle.dump(trainIndex,open('data/trainIndex.pkl','wb'))\n",
    "pickle.dump(testIndex,open('data/testIndex.pkl','wb'))\n",
    "\n",
    "# build test set\n",
    "test_tweets = [tweet_text[i] for i in testIndex]\n",
    "pickle.dump(test_tweets,open('data/test_tweets.pkl','wb'))\n",
    "\n",
    "train_tweets = [tweet_text[i] for i in trainIndex]\n",
    "pickle.dump(test_tweets,open('data/train_tweets.pkl','wb'))\n",
    "print \"train: {:,}\".format(len(train_tweets))\n",
    "print \"test: {:,}\".format(len(test_tweets))\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vectorize the Tweets\n",
    "Two steps:  \n",
    "1.  Set up a vectorizer.\n",
    "2.  Vectorize the tweets to build the vocabulary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Set up a vecterizer.\n",
    "# see http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n",
    "\n",
    "def vectorize_1():\n",
    "    vectorizer = TfidfVectorizer(#min_df=20\n",
    "                                 stop_words='english'\n",
    "                                 #, sublinear_tf=True\n",
    "                                 , use_idf=True # enable inverse-document-frequency reweighting\n",
    "                                 , ngram_range=(1,2) # given our vocab, not really necessary\n",
    "                                 , binary = True # presence of word instead of frequency\n",
    "                                 #, vocabulary = vocab\n",
    "                                ) \n",
    "    #X = vectorizer.fit_transform(tweet_list)\n",
    "    return vectorizer\n",
    "\n",
    "def vectorize_2(vocab):\n",
    "    vectorizer = CountVectorizer(stop_words='english'\n",
    "                                 , ngram_range=(1,2) # given our vocab, not really necessary\n",
    "                                 , binary = True # presence of word instead of frequency\n",
    "                                 , vocabulary = set(vocab)\n",
    "                                ) \n",
    "    #X = vectorizer.fit_transform(tweet_list)\n",
    "    return vectorizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Vectorize the tweets to build the vocabulary.\n",
    "vectorizer = vectorize_1()\n",
    "X = vectorizer.fit_transform(train_tweets)\n",
    "vocab = vectorizer.get_feature_names()\n",
    "pickle.dump(vocab, open('./data/vocab'+str(X.get_shape()[1])+'.pkl','wb')) \n",
    "print \"Totals prior to applying SVD:\"\n",
    "print \" tweets: {:,}\".format(X.get_shape()[0])\n",
    "print \" vocabulary terms: {:,}\".format(X.get_shape()[1])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "### Dimension Reduction\n",
    "\n",
    "To choose the appropriate number of svd components, we need to explore the amount of variance explained with each component. We'll reduce the number of components that explain approximately 90% of the variance.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#explained_variances = np.var(X_svd, axis=0) / np.var(X_train, axis=0).sum()\n",
    "def create_svd_doc_term_matrix(X_train, num_eigen_vectors=100):\n",
    "    \"\"\"\n",
    "    Create the array with truncated svd.\n",
    "    \"\"\"\n",
    "    # Build the fuction to create the svd space\n",
    "    svd = TruncatedSVD(n_components = num_eigen_vectors)\n",
    "    # Apply normalization in place to each row of the data \n",
    "    pipeline = make_pipeline(svd, Normalizer(copy=False))\n",
    "    return pipeline.fit_transform(X_train), svd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "explained_variance_list = []\n",
    "# The number of svd components to explore\n",
    "svd_component_range = range(1,100,10)\n",
    "\n",
    "# finds the explained variance for each number of components\n",
    "for i in svd_component_range:\n",
    "    # find explained variance (i in this case is the number of components to use)\n",
    "    X_svd, svd = create_svd_doc_term_matrix(X,i)\n",
    "    explained_variance_list.append(svd.explained_variance_ratio_.sum())\n",
    "\n",
    "expVar = pd.DataFrame({'explained_var':explained_variance_list\n",
    "                   , 'components':svd_component_range})\n",
    "display(expVar)\n",
    "\n",
    "display(expVar.plot(x='components',y='explained_var'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on the above graphs, we determine the number of the svd components to use. Typically, we want to shoot for about 90% of the variance to be explained in the data set. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "n_components = 25\n",
    "X_svd, svd = create_svd_doc_term_matrix(X,n_components)\n",
    "pickle.dump(svd, open('./data/svd_comp'+str(n_components)+'.pkl','wb')) \n",
    "X_svd.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print \"Totals prior to applying SVD:\"\n",
    "print \" tweets: {:,}\".format(X_svd.shape[0])\n",
    "print \" vocabulary terms: {:,}\".format(X_svd.shape[1]) # it's not exactly correct to say that these are vocabulary terms since each dimension is a linear combination of many terms in SVD space, but the point is that we're redicusing the dimensionality. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Cluster Centroids\n",
    "We'll now apply kmeans to find the centroids that will be used to predict a cluster for each tweet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_clusters(X_svd, k=5):\n",
    "    \"\"\"\n",
    "    Use kmeans to find centroids.\n",
    "    \"\"\"\n",
    "    km = KMeans(n_clusters=k\n",
    "                , init='k-means++'\n",
    "                , max_iter=100\n",
    "                #, n_init=10\n",
    "                , verbose=False)\n",
    "    km.fit(X_svd)\n",
    "    pred=km.predict(X_svd)\n",
    "    pred_df=pd.DataFrame(pred)\n",
    "    pred_df.columns=['pred_cluster']\n",
    "    return km.cluster_centers_ , pred_df, k, km\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Choosing k, the number of clusters, can involve much more analaysis than this tutorial is targeting. See [@jrmontag](https://twitter.com/jrmontag)'s insightful tutorial for more deatils: [choosing-k-in-kmeans](https://github.com/DrSkippy/Data-Science-45min-Intros/tree/master/choosing-k-in-kmeans)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Choose number of clusters\n",
    "my_k = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Build centroids\n",
    "centroids, predictions, n_clusters, km = build_clusters(X_svd, my_k)\n",
    "pickle.dump(centroids, open('./data/centroids'+str(n_clusters)+'.pkl','wb'))\n",
    "pickle.dump(predictions, open('./data/predictions'+str(n_clusters)+'.pkl','wb')) \n",
    "pickle.dump(km, open('./data/km'+str(n_clusters)+'.pkl','wb')) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Explore Word Loadings\n",
    "Those tweets nearest the cluster centers are used as an approximation for their meanings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "original_space_centroids = svd.inverse_transform(centroids)\n",
    "order_centroids = original_space_centroids.argsort()[:, ::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i in range(my_k):\n",
    "    print(\"Top words for cluster %d:\" % i)\n",
    "    for ind in order_centroids[i, :20]:\n",
    "        print(' %s' % vocab[ind])\n",
    "    print"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If these clusters seem opaque, we might want to start manipulating the features. Featuring engineering is a broad topic beyond the extent of this tutorial. One suggestion: consider using only the nouns from the tweets to build the vocabulary used in the vectorizer. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Label New Tweets\n",
    "Apply the model to the test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class TopicModel():\n",
    "    \"\"\"\n",
    "    Label new tweets w/ previously established centroids and vocabulary.\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        \"\"\"\n",
    "        Load and initialize any external models or data here.\n",
    "        \"\"\"\n",
    "        self.km = pickle.load(open('./data/km'+str(n_clusters)+'.pkl')) \n",
    "        self.svd = pickle.load(open('./data/svd_comp'+str(n_components)+'.pkl'))\n",
    "        self.vocab = pickle.load(open('./data/vocab'+str(X.get_shape()[1])+'.pkl'))\n",
    "        self.stop_words = text.ENGLISH_STOP_WORDS.union(['http','https','amp'])\n",
    "        self.vectorizer = CountVectorizer(\n",
    "                stop_words=self.stop_words\n",
    "                , binary = True # presence of word instead of frequency\n",
    "                , vocabulary = self.vocab\n",
    "                )\n",
    "        self.tweets = pickle.load(open('./data/test_tweets.pkl')) \n",
    "    def enrichment_value(self):\n",
    "        \"\"\"\n",
    "        Calculates the nearest cluster for an unlabeled tweet using the vocab and cluster centers from the training set.\n",
    "        \"\"\"\n",
    "        tweetTxt = self.tweets\n",
    "        # vectorize the tweet\n",
    "        X = self.vectorizer.fit_transform(tweetTxt)\n",
    "        X_svd = self.svd.transform(X)\n",
    "        labels = self.km.predict(X_svd)\n",
    "        return labels\n",
    "    def __repr__(self):\n",
    "        \"\"\" Add a description of the class's function here \"\"\"\n",
    "        return(\"Tweets vectorized using CountVectorizer to icludes 2grams in the vocab. \\\n",
    "                The 50 topic clusters are built from Twitter data from the public api,  \\\n",
    "                SVD used to reduce dimensions and kmeans for centroids. New tweets are \\\n",
    "                labled by their nearness to centroids. Result returned provide score.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create instance of model and label tweets\n",
    "model = TopicModel()\n",
    "test_data = model.enrichment_value()\n",
    "type(test_data)\n",
    "data={ str(k):[v] for k,v in Counter(test_data).items()}\n",
    "#print { for k,v in sorted(data.items(),key=itemgetter(1),reverse=True)}\n",
    "clusterID_df = pd.DataFrame(data).transpose()\n",
    "clusterID_df.columns = ['count']\n",
    "display(clusterID_df.transpose())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Review the Results\n",
    "Each time we apply kmeans, we may have some variation in the results. Developing some consistency in these results could align with the goals of our work. Below are some considerations.\n",
    "\n",
    "1.) Review the stability of distribution of the labels on the test set. If we re-run the process, does the distribution change dramatically?  \n",
    "2.) Review the \"meaning\" of the word loadings. Does the choice of k largely affect cluster terms?  \n",
    "3.) Consider new features. Start broad and then use SVD to narrow your selection. \n",
    "\n",
    "As a starting point, we will simply reiew the clusterID distribution using three different plotting methods:\n",
    "*  matplotlib\n",
    "*  BokehJS\n",
    "*  R"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# review clusterID distribution \n",
    "display(clusterID_df)\n",
    "clusterID_df.plot(kind='bar',title='Cluster ID Distribution',rot=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# review clusterID distribution \n",
    "output_notebook()\n",
    "df = pd.DataFrame({'values': list(clusterID_df['count'].values), 'names':list(clusterID_df.transpose().columns.values)})\n",
    "display(df)\n",
    "bar = Bar(df, 'names', values='values', title=\"Cluster ID Distribution\", ylabel=\"Count\", xlabel=\"Cluster ID\")\n",
    "show(bar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# push the variable to R\n",
    "df = pd.DataFrame({'values': [int(item) for item in list(clusterID_df['count'].values)], 'names':list(clusterID_df.transpose().columns.values)})\n",
    "display(df)\n",
    "%Rpush df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%R \n",
    "ggplot(data=df,) + geom_bar(stat='identity',aes(x=names, y=values), color='white',fill='blue') + ggtitle(\"Cluster ID Distribution\")+ xlab('Cluster ID')+ylab('Count')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
